df <- read.csv("/Users/austinra/BIOS6624/Project 0/Data Raw/Project0_Clean_v2.csv")
EDA:
glimpse(df)
## Rows: 372
## Columns: 15
## $ SubjectID <int> 3012, 3012, 3012, 3012, 3…
## $ Collection.Date <chr> "10/2/2018", "10/2/2018",…
## $ Collection.Sample <int> 1, 2, 3, 4, 1, 2, 3, 4, 1…
## $ Booket..Clock.Time <chr> "8:54", "9:38", "12:31", …
## $ MEMs..Clock.Time <chr> "8:55", "9:38", "12:30", …
## $ Sleep.Diary.reported.wake.time <chr> "8:54", "", "", "", "7:20…
## $ Booklet..Sample.interval <chr> "", "0:44:00", "3:37:00",…
## $ Booklet..Sample.interval.Decimal.Time..mins. <int> 0, 44, 217, 644, 0, 37, 3…
## $ MEMs..Sample.interval <chr> "", "0:43:00", "3:35:00",…
## $ MEMs..Sample.interval.Decimal.Time..mins. <int> NA, 43, 215, 643, NA, 36,…
## $ Cortisol..ug.dl. <dbl> 0.142, 0.259, 0.084, 0.02…
## $ DHEA..pg.dl. <dbl> 449.743, 152.798, 94.410,…
## $ Cortisol..nmol.L. <dbl> 3.91778, 7.14581, 2.31756…
## $ DHEA..nmol.L. <dbl> 1.5606082, 0.5302091, 0.3…
## $ DAYNUMB <int> 1, 1, 1, 1, 2, 2, 2, 2, 3…
#Examine which values are blank and which are NA
df %>%
summarise(
across(
everything(),
list(
blank = ~ sum(str_trim(as.character(.x)) == "", na.rm = TRUE),
na = ~ sum(is.na(.x))
)
)
)
## SubjectID_blank SubjectID_na Collection.Date_blank Collection.Date_na
## 1 0 0 0 0
## Collection.Sample_blank Collection.Sample_na Booket..Clock.Time_blank
## 1 0 0 35
## Booket..Clock.Time_na MEMs..Clock.Time_blank MEMs..Clock.Time_na
## 1 0 61 0
## Sleep.Diary.reported.wake.time_blank Sleep.Diary.reported.wake.time_na
## 1 279 0
## Booklet..Sample.interval_blank Booklet..Sample.interval_na
## 1 130 0
## Booklet..Sample.interval.Decimal.Time..mins._blank
## 1 0
## Booklet..Sample.interval.Decimal.Time..mins._na MEMs..Sample.interval_blank
## 1 55 177
## MEMs..Sample.interval_na MEMs..Sample.interval.Decimal.Time..mins._blank
## 1 0 0
## MEMs..Sample.interval.Decimal.Time..mins._na Cortisol..ug.dl._blank
## 1 177 0
## Cortisol..ug.dl._na DHEA..pg.dl._blank DHEA..pg.dl._na
## 1 0 0 5
## Cortisol..nmol.L._blank Cortisol..nmol.L._na DHEA..nmol.L._blank
## 1 0 5 0
## DHEA..nmol.L._na DAYNUMB_blank DAYNUMB_na
## 1 5 0 0
#35 blank Booket..Clock.Time
#61 blank MEMs..Clock.Time
#279 blank Sleep.Diary.reported.wake.time
#5 Cortisol..nmol.L._na, 5 DHEA..nmol.L._na
Check each SubjectID has proper number of samples
problem_check <- df %>% group_by(SubjectID, DAYNUMB) %>% summarize(n = n()) %>% filter(n > 4)
## `summarise()` has grouped output by 'SubjectID'. You can override using the
## `.groups` argument.
#Subject 3029 has 12 observations on day 3, assign new DAYNUMB based on date for this subject
df_mod <- df %>%
mutate(Collection.Date = as.Date(Collection.Date, format = "%m/%d/%Y")) %>%
mutate(
DAYNUMB = case_when(
SubjectID == 3029 & Collection.Date == "2018-10-07" ~ 1,
SubjectID == 3029 & Collection.Date == "2018-10-10" ~ 2,
SubjectID == 3029 & Collection.Date == "2018-10-12" ~ 3,
TRUE ~ DAYNUMB
)
) %>% ungroup()
Modify variable types
#SubjectID needs to be Categorical
df_mod$SubjectID <- as.character(df_mod$SubjectID)
#Collection.Sample should be Categorical
df_mod$Collection.Sample <- as.character(df_mod$Collection.Sample)
#Booket..Clock.Time should be Time
df_mod$Booket..Clock.Time <-hm(df_mod$Booket..Clock.Time)
## Warning in .parse_hms(..., order = "HM", quiet = quiet): Some strings failed to
## parse
#MEMs..Clock.Time should be Time
df_mod$MEMs..Clock.Time <- hm(df_mod$MEMs..Clock.Time)
## Warning in .parse_hms(..., order = "HM", quiet = quiet): Some strings failed to
## parse
#Sleep.Diary.reported.wake.time should be Time
df_mod$Sleep.Diary.reported.wake.time <- hm(df_mod$Sleep.Diary.reported.wake.time)
## Warning in .parse_hms(..., order = "HM", quiet = quiet): Some strings failed to
## parse
#Sleep.Diary.reported.wake.time carried down for each SubjectID DAYNUMB combination
df_mod <- df_mod %>%
arrange(
SubjectID,
DAYNUMB,
suppressWarnings(as.integer(Collection.Sample))
) %>%
group_by(SubjectID, DAYNUMB) %>%
tidyr::fill(Sleep.Diary.reported.wake.time, .direction = "down") %>%
ungroup()
#Sample Intervals should not be used in the analysis - calculate own values #Booklet..Sample.interval is the Number of hours and minutes from waking recorded by participant formatted as hh:mm where waking was recorded as midnight 00:00 #Booklet..Sample.interval.Decimal.Time..mins. integer representation of previous variable aka number of minutes from waking for the sample as recorded by participant. # MEMs..Sample.interval Number of hours and minutes from waking electronic cap stamp formatted as hh:mm where waking was recorded as midnight 00:00 #MEMs..Sample.interval.Decimal.Time..mins. integer representation of previous variable aka number of minutes from waking electronic stamp.
# Convert times to minutes since midnight
df_mod$booklet_mins <- lubridate::hour(df_mod$Booket..Clock.Time) * 60 +
lubridate::minute(df_mod$Booket..Clock.Time)
df_mod$cap_mins <- lubridate::hour(df_mod$MEMs..Clock.Time) * 60 +
lubridate::minute(df_mod$MEMs..Clock.Time)
df_mod$wake_mins <- lubridate::hour(df_mod$Sleep.Diary.reported.wake.time) * 60 +
lubridate::minute(df_mod$Sleep.Diary.reported.wake.time)
#Creating new variables capturing elapsed time between booklet and wake time AND cap and wake time respectively
df_mod$cap_interval_mins <-
df_mod$cap_mins - df_mod$wake_mins
df_mod$booklet_interval_mins <-
df_mod$booklet_mins - df_mod$wake_mins
glimpse(df_mod)
## Rows: 372
## Columns: 20
## $ SubjectID <chr> "3012", "3012", "3012", "…
## $ Collection.Date <date> 2018-10-02, 2018-10-02, …
## $ Collection.Sample <chr> "1", "2", "3", "4", "1", …
## $ Booket..Clock.Time <Period> 8H 54M 0S, 9H 38M 0S, …
## $ MEMs..Clock.Time <Period> 8H 55M 0S, 9H 38M 0S, …
## $ Sleep.Diary.reported.wake.time <Period> 8H 54M 0S, 8H 54M 0S, …
## $ Booklet..Sample.interval <chr> "", "0:44:00", "3:37:00",…
## $ Booklet..Sample.interval.Decimal.Time..mins. <int> 0, 44, 217, 644, 0, 37, 3…
## $ MEMs..Sample.interval <chr> "", "0:43:00", "3:35:00",…
## $ MEMs..Sample.interval.Decimal.Time..mins. <int> NA, 43, 215, 643, NA, 36,…
## $ Cortisol..ug.dl. <dbl> 0.142, 0.259, 0.084, 0.02…
## $ DHEA..pg.dl. <dbl> 449.743, 152.798, 94.410,…
## $ Cortisol..nmol.L. <dbl> 3.91778, 7.14581, 2.31756…
## $ DHEA..nmol.L. <dbl> 1.5606082, 0.5302091, 0.3…
## $ DAYNUMB <dbl> 1, 1, 1, 1, 2, 2, 2, 2, 3…
## $ booklet_mins <dbl> 534, 578, 751, 1178, 440,…
## $ cap_mins <dbl> 535, 578, 750, 1178, 441,…
## $ wake_mins <dbl> 534, 534, 534, 534, 440, …
## $ cap_interval_mins <dbl> 1, 44, 216, 644, 1, 37, 3…
## $ booklet_interval_mins <dbl> 0, 44, 217, 644, 0, 37, 3…
Save processed data
#write.xlsx(df_mod, "/Users/austinra/BIOS6624/Project 0/Data Processed/Project0_ProcessedData.xlsx")
Table One Group by Sample Time, Display
myVars <- c("Cortisol..nmol.L.", "DHEA..nmol.L.", "cap_interval_mins", "booklet_interval_mins")
tab1 <-CreateTableOne(vars = myVars, data =df_mod, strata = "Collection.Sample")
tab1
## Stratified by Collection.Sample
## 1 2 3
## n 93 93 93
## Cortisol..nmol.L. (mean (SD)) 7.38 (6.05) 9.00 (4.94) 3.21 (2.29)
## DHEA..nmol.L. (mean (SD)) 1.78 (1.27) 1.11 (0.92) 0.51 (0.46)
## cap_interval_mins (mean (SD)) 5.12 (9.20) 50.11 (34.97) 323.41 (79.63)
## booklet_interval_mins (mean (SD)) 0.90 (3.71) 34.40 (9.55) 322.85 (84.87)
## Stratified by Collection.Sample
## 4 p test
## n 93
## Cortisol..nmol.L. (mean (SD)) 4.15 (10.32) <0.001
## DHEA..nmol.L. (mean (SD)) 0.50 (0.65) <0.001
## cap_interval_mins (mean (SD)) 658.86 (99.07) <0.001
## booklet_interval_mins (mean (SD)) 641.50 (84.19) <0.001
summary(tab1)
##
## ### Summary of continuous variables ###
##
## Collection.Sample: 1
## n miss p.miss mean sd median p25 p75 min max skew kurt
## Cortisol..nmol.L. 93 0 0 7.4 6 5 3.9 8 0.5 29 2 4
## DHEA..nmol.L. 93 0 0 1.8 1 2 0.8 2 0.1 5 1 1
## cap_interval_mins 93 16 17 5.1 9 3 1.0 8 -11.0 42 2 6
## booklet_interval_mins 93 5 5 0.9 4 0 0.0 0 -12.0 20 3 13
## ------------------------------------------------------------
## Collection.Sample: 2
## n miss p.miss mean sd median p25 p75 min max skew
## Cortisol..nmol.L. 93 1 1 9 4.9 8.8 5.9 11 0.5 26 0.8
## DHEA..nmol.L. 93 1 1 1 0.9 0.9 0.5 1 0.1 5 2.5
## cap_interval_mins 93 23 25 50 35.0 37.0 31.2 52 23.0 202 2.7
## booklet_interval_mins 93 6 6 34 9.5 30.0 30.0 35 20.0 79 2.4
## kurt
## Cortisol..nmol.L. 1
## DHEA..nmol.L. 8
## cap_interval_mins 8
## booklet_interval_mins 7
## ------------------------------------------------------------
## Collection.Sample: 3
## n miss p.miss mean sd median p25 p75 min max
## Cortisol..nmol.L. 93 2 2 3.2 2.3 2.8 1.4 4.2 0.5 13
## DHEA..nmol.L. 93 2 2 0.5 0.5 0.4 0.2 0.6 0.1 3
## cap_interval_mins 93 12 13 323.4 79.6 313.0 257.0 376.0 198.0 603
## booklet_interval_mins 93 11 12 322.9 84.9 320.5 255.0 378.8 180.0 615
## skew kurt
## Cortisol..nmol.L. 1.5 3.8
## DHEA..nmol.L. 2.6 8.4
## cap_interval_mins 0.7 0.6
## booklet_interval_mins 1.0 1.6
## ------------------------------------------------------------
## Collection.Sample: 4
## n miss p.miss mean sd median p25 p75 min max
## Cortisol..nmol.L. 93 2 2 4.2 10.3 2.0 1.0 3.3 0.5 90
## DHEA..nmol.L. 93 2 2 0.5 0.6 0.3 0.2 0.5 0.1 4
## cap_interval_mins 93 10 11 658.9 99.1 616.0 599.5 685.5 543.0 954
## booklet_interval_mins 93 13 14 641.5 84.2 605.0 600.0 644.2 540.0 955
## skew kurt
## Cortisol..nmol.L. 7 54
## DHEA..nmol.L. 3 12
## cap_interval_mins 2 1
## booklet_interval_mins 2 4
##
## p-values
## pNormal pNonNormal
## Cortisol..nmol.L. 1.416499e-09 2.031791e-27
## DHEA..nmol.L. 2.440905e-24 4.399950e-28
## cap_interval_mins 5.815496e-186 4.338105e-62
## booklet_interval_mins 4.881277e-217 8.595342e-69
##
## Standardize mean differences
## average 1 vs 2 1 vs 3 1 vs 4 2 vs 3
## Cortisol..nmol.L. 0.6353966 0.2930992 0.9111042 0.3814094 1.5024767
## DHEA..nmol.L. 0.8041990 0.6030542 1.3359332 1.2684900 0.8408771
## cap_interval_mins 5.5064158 1.7601132 5.6151976 9.2923977 4.4439122
## booklet_interval_mins 6.5689517 4.6257992 5.3596781 10.7498466 4.7763662
## 2 vs 4 3 vs 4
## Cortisol..nmol.L. 0.5987101 0.125580048
## DHEA..nmol.L. 0.7716639 0.005175584
## cap_interval_mins 8.1945720 3.732301902
## booklet_interval_mins 10.1325320 3.769488381
Question 1: What is the agreement between the subject’s recordings of sampling times compared to the times recorded by an electric monitoring cap?
Record rates of missingness between the caplet time and booklet time
# Cap time distribution
ggplot(df_mod %>% filter(!is.na(cap_interval_mins)),
aes(cap_interval_mins)) +
geom_histogram(binwidth = 10, fill = "steelblue", alpha = 0.8) +
theme_minimal(base_size = 14) +
labs(
x = "Cap minutes since wake",
y = "Number of samples",
title = "Cap Sampling Interval Distribution"
)
#There are cap times with negative values, let's check how many
negative_captime <- df_mod %>% filter(!is.na(cap_interval_mins) & cap_interval_mins < 0)
table(negative_captime$cap_interval_mins)
##
## -11 -9 -6 -5 -4 -3 -2 -1
## 1 1 1 1 1 1 3 5
#14 cap times with negative values, this most likely means the participant recorded a wake time in their booklet later than they actually woke up
na_captime <- df_mod%>% filter(is.na(cap_interval_mins))
table(na_captime$cap_interval_mins, useNA = "ifany")
##
## <NA>
## 61
#61 missing cap_interval mins out of 366 observations
# Book time distribution
ggplot(df_mod %>% filter(!is.na(booklet_interval_mins)),
aes(booklet_interval_mins)) +
geom_histogram(binwidth = 10, fill = "steelblue", alpha = 0.8) +
theme_minimal(base_size = 14) +
labs(
x = "Booklet minutes since wake",
y = "Number of samples",
title = "Booklet Sampling Interval Distribution"
)
#2 book times with negative values
negative_booklettime <- df_mod %>% filter(!is.na(booklet_interval_mins) & booklet_interval_mins < 0)
table(negative_booklettime$booklet_interval_mins, useNA = "ifany")
##
## -12 -2
## 1 1
na_booklettime <- df_mod%>% filter(is.na(booklet_interval_mins))
table(na_booklettime$booklet_interval_mins, useNA = "ifany")
##
## <NA>
## 35
#35 missing booklet_interval_mins out of 366 observations
Removing observations with missing cap_interval_mins or missing booklet_interval_mins from df_mod for this part of the analysis
#Drop all observations where either of these values are missing
df_q1 <- filter(df_mod, !is.na(cap_interval_mins) & !is.na(booklet_interval_mins))
#Scatter plot of cap time (minutes) from waking up and booklet time (minutes) since waking up (continuous)
ggplot(df_q1, aes(x = cap_interval_mins, y = booklet_interval_mins)) +
geom_point(alpha = 0.6) +
labs(
x = "Cap interval (minutes since wake)",
y = "Booklet interval (minutes since wake)",
title = "Cap vs Booklet Minutes from Waking"
) +
geom_abline(slope = 1, intercept = 0, color = "blue") +
theme_minimal() +
coord_equal()
Find the difference between recorded sampling time and recorded cap time and print summary statistics
df_q1$booklet_cap_difference <- df_q1$booklet_interval_mins - df_q1$cap_interval_mins
summary(df_q1$booklet_cap_difference)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -200.000 -7.000 -1.000 -7.712 1.000 133.000
We have no NA values, as expected, and booklet time is often recorded earlier than cap time.
Plot distribution of the difference between booklet sampling time and cap sampling time
ggplot(df_q1,
aes(booklet_cap_difference)) +
geom_histogram(binwidth = 10, fill = "steelblue", alpha = 0.8) +
geom_text(
stat = "bin",
binwidth = 10,
aes(label = after_stat(count)),
vjust = -0.3,
size = 3
) +
theme_minimal(base_size = 14) +
labs(
x = "Booklet interval minus cap interval",
y = "Number of samples",
title = "Difference in Sampling Interval Distribution"
)
Booklet and Cap Time fall pretty close together with some extreme
outliers representing large differences between recorded booklet time
and cap time.
Comparing booklet and electric monitoring cap using a mixed model:
q1.lmm <- lmer(booklet_interval_mins ~ cap_interval_mins + (1|SubjectID),
data = df_q1)
summary(q1.lmm)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: booklet_interval_mins ~ cap_interval_mins + (1 | SubjectID)
## Data: df_q1
##
## REML criterion at convergence: 2787.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.9420 0.0024 0.1588 0.2635 4.3866
##
## Random effects:
## Groups Name Variance Std.Dev.
## SubjectID (Intercept) 44.11 6.642
## Residual 989.60 31.458
## Number of obs: 285, groups: SubjectID, 31
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -6.50779 2.91608 66.29918 -2.232 0.029 *
## cap_interval_mins 0.99494 0.00706 262.91666 140.931 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## cp_ntrvl_mn -0.641
confint(q1.lmm)
## Computing profile confidence intervals ...
## 2.5 % 97.5 %
## .sig01 0.0000000 12.2740368
## .sigma 28.8548052 34.3503012
## (Intercept) -12.2587408 -0.7985482
## cap_interval_mins 0.9810298 1.0087634
For every 1 minute increase in recorded cap time, recorded booklet time increases by 0.995 minutes (95% CI: 0.981, 1.008, p-value: <0.0001). This suggests there is strong agreement between recorded cap time and recorded booklet time. The intercept suggests there is bias though, subjects recorded sample collection in their booklet about 6.508 (95% CI: -12.259, -0.799, p-value: 0.029) minutes earlier than recorded sample collection by the electronic cap.
Visualizing fit
plot(q1.lmm)
qqnorm(residuals(q1.lmm))
qqline(residuals(q1.lmm))
The residuals are fairly close to 0 for most of the fitted values but there are a few observations with very large negative residuals. The qqplot shows the model is doing a good job fitting the majority of the data but there is strong deviation in the tails with some large negative and positive values.
Visualizing fitted vs observed values with identity line and fitted line
pred_q1 <- tibble(
cap_interval_mins = seq(min(df_q1$cap_interval_mins, na.rm = TRUE),
max(df_q1$cap_interval_mins, na.rm = TRUE),
length.out = 200)
)
pred_q1$booklet_pred <- predict(q1.lmm, newdata = pred_q1, re.form = NA)
ggplot(df_q1, aes(x = cap_interval_mins, y = booklet_interval_mins)) +
geom_point(alpha = 0.6) +
geom_line(
data = pred_q1,
aes(x = cap_interval_mins, y = booklet_pred),
inherit.aes = FALSE,
color = "red"
) +
labs(
x = "Cap interval (minutes since wake)",
y = "Booklet interval (minutes since wake)",
title = "Cap vs Booklet Minutes from Waking"
) +
geom_abline(slope = 1, intercept = 0, color = "blue") +
theme_minimal() +
coord_equal()
Question 2: Are subjects adhering accurately to the +30 min and +10 hour sampling times required by a protocol?
Adherence window 30 minutes (Collection 2) and 10 hours (Collection 4) after waking One analysis for good adherence = 7.5 minutes before or after (inclusive) - proportion of those who met this One analysis for adequate adherence = 15 minutes before or after (inclusive) - proportion of those who met this #Create collection time var to grab samples we’re interested in and booklet and cap deviation times from those collection windows
df_q2 <- filter(df_mod, Collection.Sample %in% c("2","4"))
df_q2 <- df_q2 %>%
mutate(
cap_good = case_when(
Collection.Sample == "2" ~ between(cap_interval_mins, 22.5, 37.5),
Collection.Sample == "4" ~ between(cap_interval_mins, 592.5, 607.5),
TRUE ~ NA
),
cap_adequate = case_when(
Collection.Sample == "2" ~ between(cap_interval_mins, 15, 45),
Collection.Sample == "4" ~ between(cap_interval_mins, 585, 615),
TRUE ~ NA
),
booklet_good = case_when(
Collection.Sample == "2" ~ between(booklet_interval_mins, 22.5, 37.5),
Collection.Sample == "4" ~ between(booklet_interval_mins, 592.5, 607.5),
TRUE ~ NA
),
booklet_adequate = case_when(
Collection.Sample == "2" ~ between(booklet_interval_mins, 15, 45),
Collection.Sample == "4" ~ between(booklet_interval_mins, 585, 615),
TRUE ~ NA
)
)
Tracking adherence
adherence_summary <- df_q2 %>%
group_by(Collection.Sample) %>%
summarise(
N = n(),
#cap
cap_good_n = sum(cap_good == TRUE, na.rm = TRUE),
cap_adequate_n = sum(cap_adequate == TRUE, na.rm = TRUE),
cap_missing_n = sum(is.na(cap_interval_mins)),
#booklet
booklet_good_n = sum(booklet_good == TRUE, na.rm = TRUE),
booklet_adequate_n = sum(booklet_adequate == TRUE, na.rm = TRUE),
booklet_missing_n = sum(is.na(booklet_interval_mins)),
.groups = "drop"
) %>%
mutate(
cap_good_pct = round(100 * cap_good_n / (N - cap_missing_n), 1),
cap_adequate_pct = round(100 * cap_adequate_n / (N - cap_missing_n), 1),
booklet_good_pct = round(100 * booklet_good_n / (N - booklet_missing_n), 1),
booklet_adequate_pct = round(100 * booklet_adequate_n / (N - booklet_missing_n), 1)
)
#Create a tidy table for report/presentation
adherence_table <- adherence_summary %>%
mutate(Collection.Sample = recode(Collection.Sample,
"2" = "30 min",
"4" = "10 hr")) %>%
rename(
`Sample Time` = Collection.Sample,
`N` = N,
`Cap Missing (n)` = cap_missing_n,
`Booklet Missing (n)` = booklet_missing_n,
`Cap Sample Good Adherence (%)` = cap_good_pct,
`Cap Sample Adequate Adherence (%)` = cap_adequate_pct,
`Booklet Sample Good Adherence (%)` = booklet_good_pct,
`Booklet Sample Adequate Adherence (%)` = booklet_adequate_pct
) %>%
dplyr::select(
`Sample Time`,
`N`,
`Cap Missing (n)`,
`Cap Sample Good Adherence (%)`,
`Cap Sample Adequate Adherence (%)`,
`Booklet Missing (n)`,
`Booklet Sample Good Adherence (%)`,
`Booklet Sample Adequate Adherence (%)`
)
kable(adherence_table, caption = "Cap and Booklet Adherence by Collection Sample")
| Sample Time | N | Cap Missing (n) | Cap Sample Good Adherence (%) | Cap Sample Adequate Adherence (%) | Booklet Missing (n) | Booklet Sample Good Adherence (%) | Booklet Sample Adequate Adherence (%) |
|---|---|---|---|---|---|---|---|
| 30 min | 93 | 23 | 52.9 | 71.4 | 6 | 78.2 | 89.7 |
| 10 hr | 93 | 10 | 32.5 | 39.8 | 13 | 46.2 | 56.2 |
Calculate deviation (minutes) by sample for visualization
df_q2 <- df_q2 %>%
mutate(
target_mins = case_when(
Collection.Sample == "2" ~ 30,
Collection.Sample == "4" ~ 60*10,
TRUE ~ NA_real_
),
cap_dev_mins = cap_interval_mins - target_mins,
booklet_dev_mins = booklet_interval_mins - target_mins
)
Cap Adherence Deviation Visualization
ggplot(df_q2 %>% filter(!is.na(cap_dev_mins)),
aes(x = cap_dev_mins)) +
geom_histogram(binwidth = 10, fill = "steelblue") +
geom_vline(xintercept = 0) +
geom_vline(xintercept = c(-7.5, 7.5), linetype = "dashed") +
geom_vline(xintercept = c(-15, 15), linetype = "dotted") +
theme_minimal() +
labs(
x = "Cap deviation from target (minutes)",
y = "Count",
title = "Cap Sampling Deviations"
)
Booklet Adherence Deviation
ggplot(df_q2 %>% filter(!is.na(booklet_dev_mins)),
aes(x = booklet_dev_mins)) +
geom_histogram(binwidth = 10, fill = "steelblue") +
geom_vline(xintercept = 0) +
geom_vline(xintercept = c(-7.5, 7.5), linetype = "dashed") +
geom_vline(xintercept = c(-15, 15), linetype = "dotted") +
theme_minimal() +
labs(
x = "Booklet deviation from target (minutes)",
y = "Count",
title = "Booklet Sampling Deviations"
)
Let’s see whether cap deviations vary by sample:
ggplot(df_q2 %>% filter(!is.na(cap_dev_mins)),
aes(factor(Collection.Sample), cap_dev_mins)) +
geom_boxplot() +
theme_minimal(base_size = 14) +
labs(
x = "Sample",
y = "Cap deviation (minutes)",
title = "Cap Deviations by Sample"
)
Let’s see whether bookletdeviations vary by sample:
ggplot(df_q2 %>% filter(!is.na(booklet_dev_mins)),
aes(factor(Collection.Sample), booklet_dev_mins)) +
geom_boxplot() +
theme_minimal(base_size = 14) +
labs(
x = "Sample",
y = "Booklet deviation (minutes)",
title = "Booklet Deviations by Sample"
)
Visualize cap and booklet times together
dfq2_long <- df_q2 %>%
pivot_longer(
cols = c(cap_dev_mins, booklet_dev_mins),
names_to = "method",
values_to = "dev_mins"
) %>%
filter(!is.na(dev_mins)) %>%
mutate(
method = recode(method,
cap_dev_mins = "Cap",
booklet_dev_mins = "Booklet"),
Collection.Sample = factor(Collection.Sample)
)
ggplot(dfq2_long, aes(Collection.Sample, dev_mins, fill = method)) +
geom_boxplot(position = position_dodge(width = 0.75)) +
geom_hline(yintercept = 0, linetype = "dashed") +
theme_minimal(base_size = 14) +
labs(
x = "Sample",
y = "Deviation (minutes)",
title = "Cap and Booklet Deviations by Sample",
fill = "Method"
)
Question 3: What are the patterns of change over the course of the day? What is the investigator interested in knowing about changes over time? The investigator stated that levels of cortisol and DHEA are expected to increase sharply from waking to 30 minutes after waking and then decline more slowly over the rest of the day. She was interested in knowing if the SPIT test shows this expected pattern. She wanted to know: 1) is there an increase in cortisol and/or DHEA from waking to 30 minutes post waking? And 2) what is the rate of decline in cortisol and DHEA after 30 minutes from waking? She also stated that even if there are not statistically significant differences, she would like to know the estimates for the initial increase and rates of decline so she can compare them to what is known from other more standard hormone tests.
Biological Range of Data Cortisol > 26 keep but weird, Cortisol > 80 incorrect - probably a laboratory error DHEA upper limit on assay is 5.205 (max value in dataset - exclude these) - probably an error unless this is recorded repeatedly by the same individual (probably exclude these individuals bc they have a different process going on)
Outliers: Check no cortisol > 80, check no DHEA >5.205
#One outlier with cortisol value > 80 - remove this
df_q3 <- filter(df_mod, Cortisol..nmol.L. < 80)
#DHEA
x <- filter(df_q3,DHEA..nmol.L. == 5.205)
#SubjectID 3037 needs to be be removed as it is at the upper limit multiple times, suggesting something is wrong
df_q3 <- filter(df_q3, SubjectID != 3037)
# Remove DHEA levels that are at the lab maximum
df_q3 <- filter(df_q3,DHEA..nmol.L. < 5.205000)
#Removing large Cortisol outlier
df_q3 <- filter(df_q3, Cortisol..nmol.L. < 30)
#Visualize cortisol
hist(df_q3$Cortisol..nmol.L., main = "Histogram of Cortisol (nmol/L)", xlab = "Cortisol (nmol/L)")
#Heavy right skew - let's look at a log transformation
hist(log(df_q3$Cortisol..nmol.L.), main = "Histogram of Cortisol (nmol/L)", xlab = "Cortisol (nmol/L)")
#Visualizing DHEA
hist(df_q3$DHEA..nmol.L., main = "Histogram of DHEA (nmol/L)", xlab = "DHEA (nmol/L)")
#Again heavy right skew - let's look at the log transform
hist(log(df_q3$DHEA..nmol.L.), main = "Histogram of DHEA (nmol/L)", xlab = "DHEA (nmol/L)")
summary(df_q3$DHEA..nmol.L.)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1076 0.2976 0.5768 0.8991 1.2257 4.7993
summary(df_q3$Cortisol..nmol.L.)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.5242 2.0417 4.0833 5.5724 7.5045 29.2178
Significant outlier of 37, let’s remove this
ggplot(df_q3, aes(x = cap_interval_mins, y = Cortisol..nmol.L., group = SubjectID, color = factor(SubjectID))) +
geom_line() +
theme_minimal(base_size = 14) +
labs(
x = "Minutes since waking (Book)",
y = "Cortisol (nmol/L)",
title = "Cortisol Over Time Since Waking"
)
## Warning: Removed 54 rows containing missing values or values outside the scale range
## (`geom_line()`).
Subject 3024 has elevated DHEA levels
ggplot(df_q3, aes(x = cap_interval_mins, y = DHEA..nmol.L., group = SubjectID, color = factor(SubjectID))) +
geom_line() +
theme_minimal(base_size = 14) +
labs(
x = "Minutes since waking (Book)",
y = "DHEA (nmol/L)",
title = "DHEA Over Time Since Waking"
)
## Warning: Removed 54 rows containing missing values or values outside the scale range
## (`geom_line()`).
Let’s see if DHEA even needs a changepoint like cortisol
ggplot(df_q3, aes(x = cap_interval_mins, y = DHEA..nmol.L., group = SubjectID, color = factor(SubjectID))) +
geom_line() +
theme_minimal(base_size = 14) +
labs(
x = "Minutes since waking (Book)",
y = "DHEA (nmol/L)",
title = "DHEA Over Time Since Waking"
) +
coord_cartesian(xlim = c(0, 120))
## Warning: Removed 54 rows containing missing values or values outside the scale range
## (`geom_line()`).
Changepoint at 30 might make sense let’s investigate further
dhea_peak_bysubj <- df_q3 %>%
filter(!is.na(DHEA..nmol.L.), !is.na(booklet_interval_mins)) %>%
group_by(SubjectID) %>%
slice_max(DHEA..nmol.L., with_ties = FALSE) %>%
summarise(
peak_minute = booklet_interval_mins,
peak_dhea = DHEA..nmol.L.,
.groups = "drop"
)
hist(dhea_peak_bysubj$peak_minute, main = "Histogram of Peak DHEA Timing (Booklet Minutes since Waking)")
Create variable for 30 minute knot and let’s proceed with logging DHEA and Cortisol
#Modified from BIOS 6643 code- changepoint creation
df_q3 <- df_q3 %>%
mutate(
wake_30 = pmin(booklet_interval_mins, 30),
after_30 = pmax(booklet_interval_mins - 30, 0),
log_cortisol = log(Cortisol..nmol.L.),
log_dhea = log(DHEA..nmol.L.)
)
Piecewise linear regression (lmm w random slope for subject) for Cortisol
q3_cortisol.lmm <- lmer(
log_cortisol ~ wake_30 + after_30 + (1 | SubjectID),
data = df_q3
)
summary(q3_cortisol.lmm)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: log_cortisol ~ wake_30 + after_30 + (1 | SubjectID)
## Data: df_q3
##
## REML criterion at convergence: 754
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.2425 -0.5052 0.0558 0.5978 2.9472
##
## Random effects:
## Groups Name Variance Std.Dev.
## SubjectID (Intercept) 0.1175 0.3428
## Residual 0.5085 0.7131
## Number of obs: 320, groups: SubjectID, 30
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 1.683e+00 1.021e-01 8.903e+01 16.483 <2e-16 ***
## wake_30 6.239e-03 3.580e-03 2.881e+02 1.743 0.0825 .
## after_30 -2.282e-03 1.809e-04 2.916e+02 -12.614 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) wak_30
## wake_30 -0.592
## after_30 0.002 -0.499
cortisol_table <- tidy(q3_cortisol.lmm, effects = "fixed", conf.int = TRUE)
cortisol_table <- cortisol_table %>%
mutate(
estimate_exp = exp(estimate),
conf.low_exp = exp(conf.low),
conf.high_exp = exp(conf.high)
)
cortisol_table_exp <- cortisol_table %>%
dplyr::select(term, estimate_exp, conf.low_exp, conf.high_exp, p.value) %>%
rename(
Term = term,
`Rate Ratio` = estimate_exp,
`95% CI Lower` = conf.low_exp,
`95% CI Upper` = conf.high_exp,
`p-value` = p.value
)
kable(cortisol_table_exp, digits = 3,
caption = "Linear Mixed Model of Log DHEA on Minutes Since Waking - Raw Estimates")
| Term | Rate Ratio | 95% CI Lower | 95% CI Upper | p-value |
|---|---|---|---|---|
| (Intercept) | 5.381 | 4.393 | 6.592 | 0.000 |
| wake_30 | 1.006 | 0.999 | 1.013 | 0.082 |
| after_30 | 0.998 | 0.997 | 0.998 | 0.000 |
kable(cortisol_table_exp, digits = 3,
caption = "Linear Mixed Model of Log DHEA on Minutes Since Waking - Transformed Estimates")
| Term | Rate Ratio | 95% CI Lower | 95% CI Upper | p-value |
|---|---|---|---|---|
| (Intercept) | 5.381 | 4.393 | 6.592 | 0.000 |
| wake_30 | 1.006 | 0.999 | 1.013 | 0.082 |
| after_30 | 0.998 | 0.997 | 0.998 | 0.000 |
Piecewise linear regression (lmm w random slope for subject) for DHEA
q3_dhea.lmm <- lmer(
log_dhea ~ booklet_interval_mins + (1 | SubjectID),
data = df_q3
)
summary(q3_dhea.lmm)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: log_dhea ~ booklet_interval_mins + (1 | SubjectID)
## Data: df_q3
##
## REML criterion at convergence: 695.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.1521 -0.5531 -0.0212 0.6078 2.5318
##
## Random effects:
## Groups Name Variance Std.Dev.
## SubjectID (Intercept) 0.2549 0.5049
## Residual 0.4035 0.6352
## Number of obs: 320, groups: SubjectID, 30
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -3.943e-02 1.047e-01 3.559e+01 -0.376 0.709
## booklet_interval_mins -1.986e-03 1.365e-04 2.926e+02 -14.552 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## bklt_ntrvl_ -0.319
Get results in an easy to read format
dhea_table <- tidy(q3_dhea.lmm, effects = "fixed", conf.int = TRUE)
dhea_table <- dhea_table %>%
mutate(
estimate_exp = exp(estimate),
conf.low_exp = exp(conf.low),
conf.high_exp = exp(conf.high)
)
dhea_table_exp <- dhea_table %>%
dplyr::select(term, estimate_exp, conf.low_exp, conf.high_exp, p.value) %>%
rename(
Term = term,
`Rate Ratio` = estimate_exp,
`95% CI Lower` = conf.low_exp,
`95% CI Upper` = conf.high_exp,
`p-value` = p.value
)
kable(dhea_table, digits = 3,
caption = "Linear Mixed Model of Log DHEA on Minutes Since Waking - Raw Estimates")
| effect | term | estimate | std.error | statistic | df | p.value | conf.low | conf.high | estimate_exp | conf.low_exp | conf.high_exp |
|---|---|---|---|---|---|---|---|---|---|---|---|
| fixed | (Intercept) | -0.039 | 0.105 | -0.376 | 35.587 | 0.709 | -0.252 | 0.173 | 0.961 | 0.777 | 1.189 |
| fixed | booklet_interval_mins | -0.002 | 0.000 | -14.552 | 292.558 | 0.000 | -0.002 | -0.002 | 0.998 | 0.998 | 0.998 |
kable(dhea_table_exp, digits = 3,
caption = "Linear Mixed Model of Log DHEA on Minutes Since Waking - Transformed Estimates")
| Term | Rate Ratio | 95% CI Lower | 95% CI Upper | p-value |
|---|---|---|---|---|
| (Intercept) | 0.961 | 0.777 | 1.189 | 0.709 |
| booklet_interval_mins | 0.998 | 0.998 | 0.998 | 0.000 |
From resource on canvas: If this is a value great than 1, you can subtract 1 and multiply by 100 to get the % increase associated with a 1-unit increase in X1.
If this value is less than 1, you can subtract 1 and multiple by 100 to get the % decrease associated with a 1-unit increase in X1.
This means that exp(beta1) can be roughly interpreted as the percentage change in of Y with a one unit increase in X1.
Visualizing the fit of Cortisol lmm
pred_df <- tibble(
booklet_interval_mins = seq(min(df_q3$booklet_interval_mins, na.rm = TRUE),
max(df_q3$booklet_interval_mins, na.rm = TRUE),
length.out = 200)
) %>%
mutate(
wake_30 = pmin(booklet_interval_mins, 30),
after_30 = pmax(booklet_interval_mins - 30, 0)
)
pred_df$pred_log_cort <- predict(q3_cortisol.lmm, newdata = pred_df, re.form = NA)
# Back-transform to nmol/L
pred_df$pred_cort <- exp(pred_df$pred_log_cort)
ggplot(df_q3,
aes(x = booklet_interval_mins,
y = Cortisol..nmol.L.,
group = SubjectID,
color = factor(SubjectID))) +
geom_line() +
geom_line(
data = pred_df,
aes(x = booklet_interval_mins, y = pred_cort),
inherit.aes = FALSE,
linewidth = 1,
color = "black"
) +
theme_minimal(base_size = 14) +
theme(legend.position = "none") +
labs(
x = "Minutes since waking (cap)",
y = "Cortisol (nmol/L)",
title = "Cortisol Over Time Since Waking (with Model Fit)"
)
## Warning: Removed 31 rows containing missing values or values outside the scale range
## (`geom_line()`).
DHEA
pred_dhea <- tibble(
booklet_interval_mins = seq(min(df_q3$booklet_interval_mins, na.rm = TRUE),
max(df_q3$booklet_interval_mins, na.rm = TRUE),
length.out = 200)
)
pred_dhea$pred_log_dhea <- predict(q3_dhea.lmm, newdata = pred_dhea, re.form = NA)
# Back-transform to nmol/L
pred_dhea$pred_dhea <- exp(pred_dhea$pred_log_dhea)
ggplot(df_q3,
aes(x = booklet_interval_mins,
y = DHEA..nmol.L.,
group = SubjectID,
color = factor(SubjectID))) +
geom_line() +
geom_line(
data = pred_dhea,
aes(x = booklet_interval_mins, y = pred_dhea),
inherit.aes = FALSE,
linewidth = 1,
color = "black"
) +
theme_minimal(base_size = 14) +
theme(legend.position = "none") +
labs(
x = "Minutes since waking (booklet)",
y = "DHEA (nmol/L)",
title = "DHEA Over Time Since Waking (Linear Mixed Model Fit)"
)
## Warning: Removed 31 rows containing missing values or values outside the scale range
## (`geom_line()`).